#This script finds the words most aligned with the expanded axis of opposition
#It does so by searching in the embedding space for the words most aligned with the axis
#It produces a vector of the 20 words most aligned with the axis of opposition
#It also produces a vector of the 20 words most aligned with the inverse of the axis of opposition
library(dplyr)
library(text2vec)
library(readr)

# Read the manually curated term lists and keep only the terms flagged with
# include == 1 (adjust the file paths as needed)
oppterms <- local({
  opp_table <- read_csv("data/pretrained_embedding/oppterms_manual.csv")
  pull(filter(opp_table, include == 1), term)
})

supterms <- local({
  sup_table <- read_csv("data/pretrained_embedding/supterms_manual.csv")
  pull(filter(sup_table, include == 1), term)
})

#' Find the words closest to the opposition-support axis
#'
#' Averages the embedding rows of the opposition terms and of the support
#' terms, takes their difference (opposition minus support) and ranks every
#' row of `embedding` by its cosine similarity to that difference vector.
#'
#' @param embedding Numeric matrix of word vectors, one row per word, with the
#'   words as rownames.
#' @param oppterms Character vector of opposition terms. Terms that are not
#'   rownames of `embedding` are ignored.
#' @param supterms Character vector of support terms. Terms that are not
#'   rownames of `embedding` are ignored.
#' @param top_n Maximum number of words to return; a single non-negative
#'   number (default 20).
#' @return A named numeric vector of cosine similarities sorted in decreasing
#'   order, with the words as names. It holds at most `top_n` entries, and
#'   fewer when fewer similarities are available (it is never padded with
#'   `NA`).
find_closest_words <- function(embedding, oppterms, supterms, top_n = 20) {
  stopifnot(
    is.numeric(top_n),
    length(top_n) == 1,
    !is.na(top_n),
    top_n >= 0
  )

  # Ensure that the terms exist in the embedding's vocabulary
  valid_opp <- intersect(oppterms, rownames(embedding))
  valid_sup <- intersect(supterms, rownames(embedding))

  if (length(valid_opp) == 0 || length(valid_sup) == 0) {
    stop("None of the provided terms are found in the embedding vocabulary!")
  }

  # Average vector of each term set; drop = FALSE keeps a one-row selection a
  # matrix so colMeans() still works with a single valid term
  opp_vector <- colMeans(embedding[valid_opp, , drop = FALSE])
  sup_vector <- colMeans(embedding[valid_sup, , drop = FALSE])

  # Axis of opposition: opposition centroid minus support centroid
  diff_vector <- opp_vector - sup_vector

  # Cosine similarity between every word vector (rows of x) and the axis,
  # which is passed as a one-row matrix
  similarities <- sim2(x = embedding,
                       y = matrix(diff_vector, nrow = 1),
                       method = "cosine", norm = "l2")

  # Convert the similarities to a named vector (words as names)
  similarities <- as.vector(similarities)
  names(similarities) <- rownames(embedding)

  # sort() drops NA similarities; clamp the requested count to what is left so
  # the result is never padded with NA entries, and top_n = 0 returns nothing
  ranked <- sort(similarities, decreasing = TRUE)
  n_keep <- floor(min(top_n, length(ranked)))
  ranked[seq_len(n_keep)]
}

# Read the combined local GloVe embedding for the chosen version from disk
version <- "150000030k"
local_glove <- readRDS(
  file = paste0(
    "data/embedding_combined/combined_local_glove",
    version,
    ".rds"
  )
)

# Rank the vocabulary against the opposition-minus-support axis and show the
# 20 words most aligned with the "opposition" end of the scale
top_related_words <- find_closest_words(
  embedding = local_glove,
  oppterms = oppterms,
  supterms = supterms,
  top_n = 20
)
print(top_related_words)

#' Find the words closest to the inverse (support-opposition) axis
#'
#' Averages the embedding rows of the support terms and of the opposition
#' terms, takes their difference (support minus opposition) and ranks every
#' row of `embedding` by its cosine similarity to that difference vector.
#'
#' @param embedding Numeric matrix of word vectors, one row per word, with the
#'   words as rownames.
#' @param oppterms Character vector of opposition terms. Terms that are not
#'   rownames of `embedding` are ignored.
#' @param supterms Character vector of support terms. Terms that are not
#'   rownames of `embedding` are ignored.
#' @param top_n Maximum number of words to return; a single non-negative
#'   number (default 20).
#' @return A named numeric vector of cosine similarities sorted in decreasing
#'   order, with the words as names. It holds at most `top_n` entries, and
#'   fewer when fewer similarities are available (it is never padded with
#'   `NA`).
find_closest_words_inverse <- function(embedding, oppterms, supterms, top_n = 20) {
  stopifnot(
    is.numeric(top_n),
    length(top_n) == 1,
    !is.na(top_n),
    top_n >= 0
  )

  # Keep only the terms that are part of the embedding's vocabulary
  valid_opp <- intersect(oppterms, rownames(embedding))
  valid_sup <- intersect(supterms, rownames(embedding))

  if (length(valid_opp) == 0 || length(valid_sup) == 0) {
    stop("None of the provided terms are found in the embedding vocabulary!")
  }

  # Average vector of each term set; drop = FALSE keeps a one-row selection a
  # matrix so colMeans() still works with a single valid term
  opp_vector <- colMeans(embedding[valid_opp, , drop = FALSE])
  sup_vector <- colMeans(embedding[valid_sup, , drop = FALSE])

  # Inverted axis: support centroid minus opposition centroid
  diff_vector_inv <- sup_vector - opp_vector

  # Cosine similarity between every word vector (rows of x) and the inverted
  # axis, which is passed as a one-row matrix
  similarities <- sim2(x = embedding,
                       y = matrix(diff_vector_inv, nrow = 1),
                       method = "cosine", norm = "l2")

  similarities <- as.vector(similarities)
  names(similarities) <- rownames(embedding)

  # sort() drops NA similarities; clamp the requested count to what is left so
  # the result is never padded with NA entries, and top_n = 0 returns nothing
  ranked <- sort(similarities, decreasing = TRUE)
  n_keep <- floor(min(top_n, length(ranked)))
  ranked[seq_len(n_keep)]
}

# Rank the vocabulary against the inverted (support-minus-opposition) axis and
# show the 20 best-aligned words
top_inverse_words <- find_closest_words_inverse(
  embedding = local_glove,
  oppterms = oppterms,
  supterms = supterms,
  top_n = 20
)
print(top_inverse_words)
